Customers leaving the credit card service would lead the bank to losses, so the bank wants to analyze its customer data to identify the customers who are likely to leave the credit card service, and the reasons why — so that the bank can improve in those areas.
Thera Bank needs to come up with a classification model that will help the bank improve its services so that customers do not give up their credit cards.
import pandas as pd # importing Pandas for dataset import
import matplotlib.pyplot as plt # importing for basic visualization and seaborn
import seaborn as sns # importing for better visualization features
import warnings # importing to suppress the warnings of deprecated features
import numpy as np # importing numpy
warnings.filterwarnings("ignore") # suppress the warnings
# for inline plotting
%matplotlib inline
pd.set_option(
"display.float_format", lambda x: "%.5f" % x
) # to suppress scientific notations
# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)
# To build linear model for prediction
from sklearn.linear_model import LinearRegression
# To check model performance
from sklearn.model_selection import train_test_split
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)
# Libraries to build decision tree classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn import tree
# To tune different models
from sklearn.model_selection import (
GridSearchCV,
train_test_split,
StratifiedKFold,
cross_val_score,
)
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
BaggingRegressor,
RandomForestRegressor,
GradientBoostingRegressor,
AdaBoostRegressor,
StackingRegressor,
)
from xgboost import XGBRegressor, XGBClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
# To be used for missing value imputation
from sklearn.impute import SimpleImputer
# To help with model building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
AdaBoostClassifier,
GradientBoostingClassifier,
RandomForestClassifier,
BaggingClassifier,
)
# To get different metric scores, and split data
from sklearn import metrics
from sklearn.metrics import (
f1_score,
accuracy_score,
recall_score,
precision_score,
confusion_matrix,
roc_auc_score,
plot_confusion_matrix,
)
# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
# To be used for tuning the model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# This will help in making the Python code more structured automatically (good coding practice)
#%load_ext nb_black
#!pip install imblearn
from imblearn.over_sampling import SMOTE
# importing dataset stored in csv file and setting 1st cloumn as index
# Import the dataset from a CSV file (absolute local path — adjust per machine).
# NOTE(review): no index_col is passed, so pandas assigns a default RangeIndex.
df = pd.read_csv(
r"C:\Users\user\Desktop\AI_ML_Austin\notebook prog files\ProjectWork5\BankChurners.csv"
)
# creating a copy of the data so that the original dataframe remains unchanged
ccpredf1 = df.copy()
ccpredf1.head(10)  # quick sanity check: display the top 10 rows
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | 5 | 1 | 3 | 12691.00000 | 777 | 11914.00000 | 1.33500 | 1144 | 42 | 1.62500 | 0.06100 |
| 1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | 6 | 1 | 2 | 8256.00000 | 864 | 7392.00000 | 1.54100 | 1291 | 33 | 3.71400 | 0.10500 |
| 2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | 4 | 1 | 0 | 3418.00000 | 0 | 3418.00000 | 2.59400 | 1887 | 20 | 2.33300 | 0.00000 |
| 3 | 769911858 | Existing Customer | 40 | F | 4 | High School | NaN | Less than $40K | Blue | 34 | 3 | 4 | 1 | 3313.00000 | 2517 | 796.00000 | 1.40500 | 1171 | 20 | 2.33300 | 0.76000 |
| 4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | 5 | 1 | 0 | 4716.00000 | 0 | 4716.00000 | 2.17500 | 816 | 28 | 2.50000 | 0.00000 |
| 5 | 713061558 | Existing Customer | 44 | M | 2 | Graduate | Married | $40K - $60K | Blue | 36 | 3 | 1 | 2 | 4010.00000 | 1247 | 2763.00000 | 1.37600 | 1088 | 24 | 0.84600 | 0.31100 |
| 6 | 810347208 | Existing Customer | 51 | M | 4 | NaN | Married | $120K + | Gold | 46 | 6 | 1 | 3 | 34516.00000 | 2264 | 32252.00000 | 1.97500 | 1330 | 31 | 0.72200 | 0.06600 |
| 7 | 818906208 | Existing Customer | 32 | M | 0 | High School | NaN | $60K - $80K | Silver | 27 | 2 | 2 | 2 | 29081.00000 | 1396 | 27685.00000 | 2.20400 | 1538 | 36 | 0.71400 | 0.04800 |
| 8 | 710930508 | Existing Customer | 37 | M | 3 | Uneducated | Single | $60K - $80K | Blue | 36 | 5 | 2 | 0 | 22352.00000 | 2517 | 19835.00000 | 3.35500 | 1350 | 24 | 1.18200 | 0.11300 |
| 9 | 719661558 | Existing Customer | 48 | M | 2 | Graduate | Single | $80K - $120K | Blue | 36 | 6 | 3 | 3 | 11656.00000 | 1677 | 9979.00000 | 1.52400 | 1441 | 32 | 0.88200 | 0.14400 |
It is always better to check random rows instead of only the top rows.
np.random.seed(
1
)  # seed the global NumPy RNG so the same random sample is drawn on every run
ccpredf1.sample(n=20)  # inspect 20 random rows of the dataset
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6498 | 712389108 | Existing Customer | 43 | F | 2 | Graduate | Married | Less than $40K | Blue | 36 | 6 | 3 | 2 | 2570.00000 | 2107 | 463.00000 | 0.65100 | 4058 | 83 | 0.76600 | 0.82000 |
| 9013 | 718388733 | Existing Customer | 38 | F | 1 | College | NaN | Less than $40K | Blue | 32 | 2 | 3 | 3 | 2609.00000 | 1259 | 1350.00000 | 0.87100 | 8677 | 96 | 0.62700 | 0.48300 |
| 2053 | 710109633 | Existing Customer | 39 | M | 2 | College | Married | $60K - $80K | Blue | 31 | 6 | 3 | 2 | 9871.00000 | 1061 | 8810.00000 | 0.54500 | 1683 | 34 | 0.47800 | 0.10700 |
| 3211 | 717331758 | Existing Customer | 44 | M | 4 | Graduate | Married | $120K + | Blue | 32 | 6 | 3 | 4 | 34516.00000 | 2517 | 31999.00000 | 0.76500 | 4228 | 83 | 0.59600 | 0.07300 |
| 5559 | 709460883 | Attrited Customer | 38 | F | 2 | Doctorate | Married | Less than $40K | Blue | 28 | 5 | 2 | 4 | 1614.00000 | 0 | 1614.00000 | 0.60900 | 2437 | 46 | 0.43800 | 0.00000 |
| 6106 | 789105183 | Existing Customer | 54 | M | 3 | Post-Graduate | Single | $80K - $120K | Silver | 42 | 3 | 1 | 2 | 34516.00000 | 2488 | 32028.00000 | 0.55200 | 4401 | 87 | 0.77600 | 0.07200 |
| 4150 | 771342183 | Attrited Customer | 53 | F | 3 | Graduate | Single | $40K - $60K | Blue | 40 | 6 | 3 | 2 | 1625.00000 | 0 | 1625.00000 | 0.68900 | 2314 | 43 | 0.43300 | 0.00000 |
| 2205 | 708174708 | Existing Customer | 38 | M | 4 | Graduate | Married | $40K - $60K | Blue | 27 | 6 | 2 | 4 | 5535.00000 | 1276 | 4259.00000 | 0.63600 | 1764 | 38 | 0.90000 | 0.23100 |
| 4145 | 718076733 | Existing Customer | 43 | M | 1 | Graduate | Single | $60K - $80K | Silver | 31 | 4 | 3 | 3 | 25824.00000 | 1170 | 24654.00000 | 0.68400 | 3101 | 73 | 0.78000 | 0.04500 |
| 5324 | 821889858 | Attrited Customer | 50 | F | 1 | Doctorate | Single | abc | Blue | 46 | 6 | 4 | 3 | 1970.00000 | 1477 | 493.00000 | 0.66200 | 2493 | 44 | 0.57100 | 0.75000 |
| 7671 | 788885133 | Existing Customer | 57 | F | 3 | College | NaN | Less than $40K | Silver | 42 | 6 | 1 | 1 | 12675.00000 | 2422 | 10253.00000 | 0.64200 | 3912 | 63 | 0.57500 | 0.19100 |
| 2903 | 717427983 | Existing Customer | 50 | F | 3 | High School | Single | Less than $40K | Silver | 39 | 4 | 3 | 4 | 11176.00000 | 2422 | 8754.00000 | 1.00300 | 2682 | 64 | 0.64100 | 0.21700 |
| 9924 | 827318958 | Existing Customer | 49 | F | 4 | NaN | Single | Less than $40K | Blue | 45 | 5 | 3 | 2 | 9431.00000 | 1785 | 7646.00000 | 0.60300 | 14261 | 99 | 0.65000 | 0.18900 |
| 9251 | 708863733 | Existing Customer | 35 | M | 3 | NaN | Single | $40K - $60K | Gold | 19 | 1 | 1 | 3 | 22725.00000 | 808 | 21917.00000 | 0.77900 | 13847 | 120 | 0.87500 | 0.03600 |
| 2675 | 824667708 | Existing Customer | 32 | F | 1 | Graduate | Single | Less than $40K | Blue | 28 | 6 | 3 | 3 | 2428.00000 | 1027 | 1401.00000 | 0.82400 | 3007 | 84 | 0.86700 | 0.42300 |
| 7381 | 798281808 | Existing Customer | 48 | M | 4 | High School | Married | $60K - $80K | Blue | 42 | 5 | 4 | 3 | 3879.00000 | 1214 | 2665.00000 | 0.92400 | 5158 | 79 | 0.68100 | 0.31300 |
| 8660 | 720849708 | Existing Customer | 44 | M | 4 | Graduate | Single | $80K - $120K | Blue | 37 | 3 | 3 | 1 | 14388.00000 | 2517 | 11871.00000 | 0.70900 | 7042 | 85 | 0.63500 | 0.17500 |
| 7173 | 710157108 | Existing Customer | 42 | F | 3 | Graduate | Single | $40K - $60K | Blue | 33 | 6 | 2 | 3 | 3376.00000 | 1661 | 1715.00000 | 0.61800 | 4629 | 75 | 0.92300 | 0.49200 |
| 2844 | 720016308 | Existing Customer | 62 | F | 1 | NaN | Single | Less than $40K | Blue | 54 | 4 | 2 | 2 | 1927.00000 | 1319 | 608.00000 | 0.67200 | 4199 | 77 | 0.75000 | 0.68400 |
| 4427 | 816864483 | Existing Customer | 39 | F | 4 | Uneducated | Single | $40K - $60K | Blue | 34 | 4 | 2 | 2 | 10264.00000 | 803 | 9461.00000 | 0.80200 | 4003 | 74 | 0.72100 | 0.07800 |
The dataset looks consistent with the description provided in the Data Dictionary. Lots of data have values which need to be processed so the data can be effectively used for analysis
print(
f"There are {ccpredf1.shape[0]} rows and {ccpredf1.shape[1]} columns."
)  # shape[0] = row count, shape[1] = column count
There are 10127 rows and 21 columns.
ccpredf1.dtypes.value_counts()  # how many columns there are of each dtype
int64 10 object 6 float64 5 dtype: int64
ccpredf1.info()  # column names, non-null counts and dtypes in a single view
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CLIENTNUM 10127 non-null int64 1 Attrition_Flag 10127 non-null object 2 Customer_Age 10127 non-null int64 3 Gender 10127 non-null object 4 Dependent_count 10127 non-null int64 5 Education_Level 8608 non-null object 6 Marital_Status 9378 non-null object 7 Income_Category 10127 non-null object 8 Card_Category 10127 non-null object 9 Months_on_book 10127 non-null int64 10 Total_Relationship_Count 10127 non-null int64 11 Months_Inactive_12_mon 10127 non-null int64 12 Contacts_Count_12_mon 10127 non-null int64 13 Credit_Limit 10127 non-null float64 14 Total_Revolving_Bal 10127 non-null int64 15 Avg_Open_To_Buy 10127 non-null float64 16 Total_Amt_Chng_Q4_Q1 10127 non-null float64 17 Total_Trans_Amt 10127 non-null int64 18 Total_Trans_Ct 10127 non-null int64 19 Total_Ct_Chng_Q4_Q1 10127 non-null float64 20 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(5), int64(10), object(6) memory usage: 1.6+ MB
ccpredf1.isnull().sum()  # missing-value count per column
CLIENTNUM 0 Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 1519 Marital_Status 749 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
# let's check for duplicate values (full-row duplicates) in the data
ccpredf1.duplicated().sum()
0
Number of unique values in each column
ccpredf1.nunique()  # number of distinct values in each column
CLIENTNUM 10127 Attrition_Flag 2 Customer_Age 45 Gender 2 Dependent_count 6 Education_Level 6 Marital_Status 3 Income_Category 6 Card_Category 4 Months_on_book 44 Total_Relationship_Count 6 Months_Inactive_12_mon 7 Contacts_Count_12_mon 7 Credit_Limit 6205 Total_Revolving_Bal 1974 Avg_Open_To_Buy 6813 Total_Amt_Chng_Q4_Q1 1158 Total_Trans_Amt 5033 Total_Trans_Ct 126 Total_Ct_Chng_Q4_Q1 830 Avg_Utilization_Ratio 964 dtype: int64
ccpredf1.describe(
include="all"
).T  # summary statistics for every column (numeric and categorical); transposed so each row describes one column
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| CLIENTNUM | 10127.00000 | NaN | NaN | NaN | 739177606.33366 | 36903783.45023 | 708082083.00000 | 713036770.50000 | 717926358.00000 | 773143533.00000 | 828343083.00000 |
| Attrition_Flag | 10127 | 2 | Existing Customer | 8500 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Customer_Age | 10127.00000 | NaN | NaN | NaN | 46.32596 | 8.01681 | 26.00000 | 41.00000 | 46.00000 | 52.00000 | 73.00000 |
| Gender | 10127 | 2 | F | 5358 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Dependent_count | 10127.00000 | NaN | NaN | NaN | 2.34620 | 1.29891 | 0.00000 | 1.00000 | 2.00000 | 3.00000 | 5.00000 |
| Education_Level | 8608 | 6 | Graduate | 3128 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Marital_Status | 9378 | 3 | Married | 4687 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Income_Category | 10127 | 6 | Less than $40K | 3561 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Card_Category | 10127 | 4 | Blue | 9436 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Months_on_book | 10127.00000 | NaN | NaN | NaN | 35.92841 | 7.98642 | 13.00000 | 31.00000 | 36.00000 | 40.00000 | 56.00000 |
| Total_Relationship_Count | 10127.00000 | NaN | NaN | NaN | 3.81258 | 1.55441 | 1.00000 | 3.00000 | 4.00000 | 5.00000 | 6.00000 |
| Months_Inactive_12_mon | 10127.00000 | NaN | NaN | NaN | 2.34117 | 1.01062 | 0.00000 | 2.00000 | 2.00000 | 3.00000 | 6.00000 |
| Contacts_Count_12_mon | 10127.00000 | NaN | NaN | NaN | 2.45532 | 1.10623 | 0.00000 | 2.00000 | 2.00000 | 3.00000 | 6.00000 |
| Credit_Limit | 10127.00000 | NaN | NaN | NaN | 8631.95370 | 9088.77665 | 1438.30000 | 2555.00000 | 4549.00000 | 11067.50000 | 34516.00000 |
| Total_Revolving_Bal | 10127.00000 | NaN | NaN | NaN | 1162.81406 | 814.98734 | 0.00000 | 359.00000 | 1276.00000 | 1784.00000 | 2517.00000 |
| Avg_Open_To_Buy | 10127.00000 | NaN | NaN | NaN | 7469.13964 | 9090.68532 | 3.00000 | 1324.50000 | 3474.00000 | 9859.00000 | 34516.00000 |
| Total_Amt_Chng_Q4_Q1 | 10127.00000 | NaN | NaN | NaN | 0.75994 | 0.21921 | 0.00000 | 0.63100 | 0.73600 | 0.85900 | 3.39700 |
| Total_Trans_Amt | 10127.00000 | NaN | NaN | NaN | 4404.08630 | 3397.12925 | 510.00000 | 2155.50000 | 3899.00000 | 4741.00000 | 18484.00000 |
| Total_Trans_Ct | 10127.00000 | NaN | NaN | NaN | 64.85869 | 23.47257 | 10.00000 | 45.00000 | 67.00000 | 81.00000 | 139.00000 |
| Total_Ct_Chng_Q4_Q1 | 10127.00000 | NaN | NaN | NaN | 0.71222 | 0.23809 | 0.00000 | 0.58200 | 0.70200 | 0.81800 | 3.71400 |
| Avg_Utilization_Ratio | 10127.00000 | NaN | NaN | NaN | 0.27489 | 0.27569 | 0.00000 | 0.02300 | 0.17600 | 0.50300 | 0.99900 |
# function to plot a boxplot and a histogram along the same scale.
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """Plot a boxplot (top) and a histogram (bottom) of ``data[feature]``
    on a shared x-axis.

    data: dataframe containing the feature
    feature: name of the column to plot
    figsize: size of the figure
    kde: whether to overlay a kernel density estimate on the histogram
    bins: number of histogram bins (seaborn chooses automatically when falsy)
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,  # Number of rows of the subplot grid = 2
        sharex=True,  # x-axis will be shared among all subplots
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )  # creating the 2 subplots
    sns.boxplot(
        data=data, x=feature, ax=ax_box2, showmeans=True, color="violet"
    )  # boxplot; the star marker indicates the mean value of the column
    # Explicit if/else instead of the original conditional expression, which was
    # evaluated only for its side effects and discarded its result (unidiomatic).
    if bins:
        sns.histplot(
            data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins, palette="winter"
        )
    else:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2)
    ax_hist2.axvline(
        data[feature].mean(), color="green", linestyle="--"
    )  # dashed green line marks the mean on the histogram
    ax_hist2.axvline(
        data[feature].median(), color="black", linestyle="-"
    )  # solid black line marks the median on the histogram
# function to label barplot
def labeled_barplot(data, feature, perc=False, n=None):
    """Draw a countplot of ``feature``'s levels with a label on every bar.

    data: dataframe containing the feature
    feature: name of the column to plot
    perc: if True, label bars with the percentage of rows instead of raw counts
    n: show only the n most frequent levels (all levels when None)
    """
    total = len(data[feature])  # number of rows, denominator for percentage labels
    count = data[feature].nunique()
    # figure width grows with the number of bars shown
    if n is None:
        plt.figure(figsize=(count + 1, 5))
    else:
        plt.figure(figsize=(n + 1, 5))
    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        order=data[feature].value_counts().index[:n].sort_values(),
    )
    for p in ax.patches:
        if perc:  # idiomatic truth test instead of "== True"
            label = "{:.1f}%".format(
                100 * p.get_height() / total
            )  # percentage of each class of the category
        else:
            label = p.get_height()  # count of each level of the category
        # Center the label on the bar. The original divided the bar width by
        # ``total`` (contradicting its own comment), which pinned every label
        # to the bar's left edge; half the bar width is the intended offset.
        x = p.get_x() + p.get_width() / 2
        y = p.get_y() + p.get_height() + 1.5  # place the label just above the bar
        ax.annotate(label, (x, y), size=12, rotation=0)  # annotate the bar
    plt.show()  # show the plot
labeled_barplot(ccpredf1, "Attrition_Flag", perc=True)  # class balance of the target variable
Almost 16% of customers have attrited.
histogram_boxplot(ccpredf1, "Customer_Age")  # histogram + boxplot of customer age
Age has an approximately symmetric distribution, with the mean and median close together. There are not many outliers in the distribution of this variable.
labeled_barplot(ccpredf1, "Gender", perc=True)  # gender distribution
There are 52.9% Females and 47.1% Males
labeled_barplot(ccpredf1, "Dependent_count", perc=True)  # number of dependents per customer
More than 91% customers have dependents
labeled_barplot(ccpredf1, "Education_Level", perc=True)  # education distribution (missing values not drawn by countplot)
There are 14.7% customers uneducated. There are many missing values that need to be investigated.
labeled_barplot(ccpredf1, "Marital_Status", perc=True)  # marital status distribution
There are many missing values that need to be investigated.
labeled_barplot(ccpredf1, "Income_Category", perc=True)  # income bands, including the placeholder value "abc"
Most of the customers have income less than $40K. ABC value need to be investigated.
labeled_barplot(ccpredf1, "Card_Category", perc=True)  # card tier distribution
93% of customers have blue card.
labeled_barplot(ccpredf1, "Months_on_book", perc=True)  # tenure with the bank, in months
Almost 25% of candidates have 36 month engagement with bank. Need to understand what is unique about 36 months engagement and why its higher than all other values.
labeled_barplot(ccpredf1, "Total_Relationship_Count", perc=True)  # number of bank products held per customer
Almost 80% of candidates have more than 2 products with the bank.
labeled_barplot(ccpredf1, "Months_Inactive_12_mon", perc=True)  # months inactive over the last 12 months
Max inactivity in the data is 6 months.
labeled_barplot(ccpredf1, "Contacts_Count_12_mon", perc=True)  # contacts with the bank over the last 12 months
Approx 80% of customers have 3 or fewer contacts with the bank.
histogram_boxplot(ccpredf1, "Credit_Limit")  # histogram + boxplot of credit limit
The credit limit has high variation and right skewed.
The pair plot between the values in dataset show the overall distribution against each of the columns. This give an overall view against the selected dataset values.
sns.pairplot(ccpredf1, hue="Attrition_Flag")
# pairwise bivariate distributions of all numeric columns, colored by attrition status
<seaborn.axisgrid.PairGrid at 0x1bd26f665e0>
plt.figure(figsize=(10, 5))
sns.boxplot(
x="Customer_Age", y="Attrition_Flag", data=ccpredf1
)  # age distribution within each attrition class
plt.show()
The age of the customer does not appear to impact attrition.
plt.figure(figsize=(10, 5))
sns.countplot(x="Attrition_Flag", hue="Gender", data=ccpredf1)
# attrition counts split by gender
<AxesSubplot:xlabel='Attrition_Flag', ylabel='count'>
More of the attrited customers are female, but this seems proportional to the overall number of female customers.
plt.figure(figsize=(10, 5))
sns.countplot(x="Attrition_Flag", hue="Dependent_count", data=ccpredf1)
# attrition counts split by number of dependents
<AxesSubplot:xlabel='Attrition_Flag', ylabel='count'>
plt.figure(figsize=(10, 5))
sns.countplot(x="Dependent_count", hue="Attrition_Flag", data=ccpredf1)
# dependent count on the x-axis, split by attrition status
<AxesSubplot:xlabel='Dependent_count', ylabel='count'>
The number of dependents does not have a very strong correlation with attrition.
plt.figure(figsize=(10, 5))
sns.countplot(x="Education_Level", hue="Attrition_Flag", data=ccpredf1)
# attrition counts per education level
<AxesSubplot:xlabel='Education_Level', ylabel='count'>
Education level does not have a very strong correlation with attrition, although at the Post-Graduate and Doctorate levels the attrition % is higher.
plt.figure(figsize=(10, 5))
sns.countplot(x="Marital_Status", hue="Attrition_Flag", data=ccpredf1)
# attrition counts per marital status
<AxesSubplot:xlabel='Marital_Status', ylabel='count'>
For divorced customers the attrition % is a little higher than for the others.
plt.figure(figsize=(10, 5))
sns.countplot(x="Income_Category", hue="Attrition_Flag", data=ccpredf1)
# attrition counts per income band
<AxesSubplot:xlabel='Income_Category', ylabel='count'>
Attrition % is higher for income groups greater than $40K. The significance of the "abc" value needs to be assessed and the value treated accordingly.
plt.figure(figsize=(10, 5))
sns.countplot(x="Card_Category", hue="Attrition_Flag", data=ccpredf1)
# attrition counts per card tier
<AxesSubplot:xlabel='Card_Category', ylabel='count'>
plt.figure(figsize=(10, 5))
sns.countplot(x="Months_on_book", hue="Attrition_Flag", data=ccpredf1)
# attrition counts by tenure (months on book)
<AxesSubplot:xlabel='Months_on_book', ylabel='count'>
plt.figure(figsize=(10, 5))
sns.countplot(x="Total_Relationship_Count", hue="Attrition_Flag", data=ccpredf1)
# attrition counts by number of bank products held
<AxesSubplot:xlabel='Total_Relationship_Count', ylabel='count'>
plt.figure(figsize=(10, 5))
sns.countplot(x="Months_Inactive_12_mon", hue="Attrition_Flag", data=ccpredf1)
# attrition counts by months of inactivity in the last year
<AxesSubplot:xlabel='Months_Inactive_12_mon', ylabel='count'>
plt.figure(figsize=(10, 5))
sns.countplot(x="Contacts_Count_12_mon", hue="Attrition_Flag", data=ccpredf1)
# attrition counts by number of bank contacts in the last year
<AxesSubplot:xlabel='Contacts_Count_12_mon', ylabel='count'>
plt.figure(figsize=(20, 8))
# NOTE(review): on pandas >= 2.0 corr() raises on mixed-dtype frames and needs
# numeric_only=True — confirm the pandas version this notebook runs against.
sns.heatmap(ccpredf1.corr(), annot=True)  # correlation heatmap of the numeric columns
plt.show()
# work on a fresh copy for preprocessing so ccpredf1 (the EDA data) remains unchanged
ccpredf2 = ccpredf1.copy()
def missingvaluecheck(dat):  # function to find missing values within the dataset
    """Report how many values are missing in each column of ``dat``.

    Returns a dataframe indexed by column name with two columns:
    ``Total`` (count of nulls) and ``% of Missing Val`` (fraction of nulls),
    with the most-missing columns listed first.
    """
    null_mask = dat.isnull()  # boolean frame: True wherever a value is missing
    totals = null_mask.sum().sort_values(ascending=False)  # null count per column
    fractions = null_mask.mean().sort_values(ascending=False)  # null share per column
    # Side-by-side view of counts and fractions for every column
    return pd.concat([totals, fractions], axis=1, keys=["Total", "% of Missing Val"])
missingvaluecheck(ccpredf2)  # missing-value summary for the modeling copy
| Total | % of Missing Val | |
|---|---|---|
| Education_Level | 1519 | 0.15000 |
| Marital_Status | 749 | 0.07396 |
| Avg_Utilization_Ratio | 0 | 0.00000 |
| Months_on_book | 0 | 0.00000 |
| Attrition_Flag | 0 | 0.00000 |
| Customer_Age | 0 | 0.00000 |
| Gender | 0 | 0.00000 |
| Dependent_count | 0 | 0.00000 |
| Income_Category | 0 | 0.00000 |
| Card_Category | 0 | 0.00000 |
| Total_Relationship_Count | 0 | 0.00000 |
| Total_Ct_Chng_Q4_Q1 | 0 | 0.00000 |
| Months_Inactive_12_mon | 0 | 0.00000 |
| Contacts_Count_12_mon | 0 | 0.00000 |
| Credit_Limit | 0 | 0.00000 |
| Total_Revolving_Bal | 0 | 0.00000 |
| Avg_Open_To_Buy | 0 | 0.00000 |
| Total_Amt_Chng_Q4_Q1 | 0 | 0.00000 |
| Total_Trans_Amt | 0 | 0.00000 |
| Total_Trans_Ct | 0 | 0.00000 |
| CLIENTNUM | 0 | 0.00000 |
ccpredf2.drop(["CLIENTNUM"], axis=1, inplace=True)  # drop the client ID: unique per row, no predictive value
# Replacing the placeholder income value "abc" with "Not Declared"
ccpredf2["Income_Category"] = ccpredf2["Income_Category"].replace("abc", "Not Declared")
ccpredf3 = ccpredf2.copy()  # working copy for the encoding steps
ccpredf3.shape
(10127, 20)
# One-hot encode every object/category column; drop_first removes one redundant
# dummy per feature. NOTE(review): rows where the category is NaN get all-zero
# dummies (pandas default dummy_na=False), which implicitly encodes "missing".
ccpredf3 = pd.get_dummies(
ccpredf3,
columns=ccpredf3.select_dtypes(include=["object", "category"]).columns.tolist(),
drop_first=True,
)
np.random.seed(
1
)  # seed the RNG so the same random sample is drawn every time
ccpredf3.sample(n=10)  # inspect 10 random rows of the encoded dataset
| Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Attrition_Flag_Existing Customer | Gender_M | Education_Level_Doctorate | Education_Level_Graduate | Education_Level_High School | Education_Level_Post-Graduate | Education_Level_Uneducated | Marital_Status_Married | Marital_Status_Single | Income_Category_$40K - $60K | Income_Category_$60K - $80K | Income_Category_$80K - $120K | Income_Category_Less than $40K | Income_Category_Not Declared | Card_Category_Gold | Card_Category_Platinum | Card_Category_Silver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6498 | 43 | 2 | 36 | 6 | 3 | 2 | 2570.00000 | 2107 | 463.00000 | 0.65100 | 4058 | 83 | 0.76600 | 0.82000 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 9013 | 38 | 1 | 32 | 2 | 3 | 3 | 2609.00000 | 1259 | 1350.00000 | 0.87100 | 8677 | 96 | 0.62700 | 0.48300 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 2053 | 39 | 2 | 31 | 6 | 3 | 2 | 9871.00000 | 1061 | 8810.00000 | 0.54500 | 1683 | 34 | 0.47800 | 0.10700 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3211 | 44 | 4 | 32 | 6 | 3 | 4 | 34516.00000 | 2517 | 31999.00000 | 0.76500 | 4228 | 83 | 0.59600 | 0.07300 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 5559 | 38 | 2 | 28 | 5 | 2 | 4 | 1614.00000 | 0 | 1614.00000 | 0.60900 | 2437 | 46 | 0.43800 | 0.00000 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 6106 | 54 | 3 | 42 | 3 | 1 | 2 | 34516.00000 | 2488 | 32028.00000 | 0.55200 | 4401 | 87 | 0.77600 | 0.07200 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 4150 | 53 | 3 | 40 | 6 | 3 | 2 | 1625.00000 | 0 | 1625.00000 | 0.68900 | 2314 | 43 | 0.43300 | 0.00000 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2205 | 38 | 4 | 27 | 6 | 2 | 4 | 5535.00000 | 1276 | 4259.00000 | 0.63600 | 1764 | 38 | 0.90000 | 0.23100 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4145 | 43 | 1 | 31 | 4 | 3 | 3 | 25824.00000 | 1170 | 24654.00000 | 0.68400 | 3101 | 73 | 0.78000 | 0.04500 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| 5324 | 50 | 1 | 46 | 6 | 4 | 3 | 1970.00000 | 1477 | 493.00000 | 0.66200 | 2493 | 44 | 0.57100 | 0.75000 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
ccpredf3.shape  # rows x columns after one-hot encoding
(10127, 31)
# Separating target variable and other variables.
# NOTE(review): the target dummy is "Existing Customer" (1 = retained,
# 0 = attrited), so "recall" downstream measures the majority class, not the
# churners — confirm this is intended.
X = ccpredf3.drop(columns="Attrition_Flag_Existing Customer")
X = pd.get_dummies(X)  # no-op here: all remaining columns are already numeric after the earlier encoding
Y = ccpredf3["Attrition_Flag_Existing Customer"]
# Splitting data into training, validation and test set:
# first we split data into 2 parts, say temporary and test
X_temp, X_test, y_temp, y_test = train_test_split(
X, Y, test_size=0.2, random_state=1, stratify=Y
)  # hold out 20% as the test set, preserving class proportions
# then we split the temporary set into train and validation
X_train, X_val, y_train, y_val = train_test_split(
X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp
)  # 0.25 of the remaining 80% = 20% validation -> final split is 60/20/20
print(X_train.shape, X_val.shape, X_test.shape)
(6075, 30) (2026, 30) (2026, 30)
# Median imputation for any residual missing values; median is robust to the
# heavily skewed credit/transaction columns.
# NOTE(review): after get_dummies the matrix should contain no NaNs (NaN rows
# became all-zero dummies), so this is likely a no-op — kept as a safety net.
imputer = SimpleImputer(strategy="median")
# fit() learns the column medians from the training split only (no leakage) and
# returns the imputer itself, so ``impute`` and ``imputer`` are the same object;
# the original mixed the two names — use one consistently below.
impute = imputer.fit(X_train)
X_train = impute.transform(X_train)  # medians learned from the training split
X_val = impute.transform(X_val)  # apply train medians to the validation data
X_test = impute.transform(X_test)  # apply train medians to the test data
models = []  # Empty list to store all the models
# Appending (display name, estimator) pairs into the list
models.append(("Bagging", BaggingClassifier(random_state=1)))
models.append(("Random forest", RandomForestClassifier(random_state=1)))
models.append(("GBM", GradientBoostingClassifier(random_state=1)))
models.append(("Adaboost", AdaBoostClassifier(random_state=1)))
models.append(("Xgboost", XGBClassifier(random_state=1, eval_metric="logloss")))
models.append(("dtree", DecisionTreeClassifier(random_state=1)))
results = []  # Empty list to store all model's CV scores
names = []  # Empty list to store name of the models
# Loop invariants hoisted out of the loop: the metric and the fold generator
# were identical on every iteration (same random_state -> same splits).
scoring = "recall"  # metric of interest for this comparison
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # 5 stratified splits, shared by all models
# loop through all models to get the mean cross validated score
print("\n" "Cross-Validation Performance:" "\n")
for name, model in models:
    cv_result = cross_val_score(
        estimator=model, X=X_train, y=y_train, scoring=scoring, cv=kfold
    )
    results.append(cv_result)
    names.append(name)
    print("{}: {}".format(name, cv_result.mean() * 100))
print("\n" "Training Performance:" "\n")
for name, model in models:
    model.fit(X_train, y_train)
    # recall on the data the model was trained on — an overfitting check
    scores = recall_score(y_train, model.predict(X_train)) * 100
    print("{}: {}".format(name, scores))
Cross-Validation Performance: Bagging: 97.52921934230022 Random forest: 98.74494410129117 GBM: 98.78412130308453 Adaboost: 97.92122226712078 Xgboost: 98.68606284515768 dtree: 96.07796955877541 Training Performance: Bagging: 99.74504804863699 Random forest: 100.0 GBM: 99.27436752304374 Adaboost: 98.25455971759169 Xgboost: 100.0 dtree: 100.0
Performance of all the models are similar.
print("Before UpSampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before UpSampling, counts of label '0': {} \n".format(sum(y_train == 0)))
# SMOTE synthesizes new minority-class samples (here label 0, the attrited
# customers) by interpolating between nearest neighbors.
sm = SMOTE(
sampling_strategy=1, k_neighbors=5, random_state=1
)  # Synthetic Minority Over Sampling Technique; sampling_strategy=1 -> balance the classes 1:1
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel())
print("After UpSampling, counts of label '1': {}".format(sum(y_train_res == 1)))
print("After UpSampling, counts of label '0': {} \n".format(sum(y_train_res == 0)))
Before UpSampling, counts of label '1': 5099 Before UpSampling, counts of label '0': 976 After UpSampling, counts of label '1': 5099 After UpSampling, counts of label '0': 5099
models = []  # Empty list to store all the models
# Appending (display name, estimator) pairs into the list
models.append(("Bagging", BaggingClassifier(random_state=1)))
models.append(("Random forest", RandomForestClassifier(random_state=1)))
models.append(("GBM", GradientBoostingClassifier(random_state=1)))
models.append(("Adaboost", AdaBoostClassifier(random_state=1)))
models.append(("Xgboost", XGBClassifier(random_state=1, eval_metric="logloss")))
models.append(("dtree", DecisionTreeClassifier(random_state=1)))
results = []  # Empty list to store all model's CV scores
names = []  # Empty list to store name of the models
# Loop invariants hoisted out of the loop: the metric and the fold generator
# were identical on every iteration (same random_state -> same splits).
scoring = "recall"  # metric of interest for this comparison
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # 5 stratified splits, shared by all models
# loop through all models: mean cross validated score on the SMOTE-balanced data
print("\n" "Cross-Validation Performance:" "\n")
for name, model in models:
    cv_result = cross_val_score(
        estimator=model, X=X_train_res, y=y_train_res, scoring=scoring, cv=kfold
    )
    results.append(cv_result)
    names.append(name)
    print("{}: {}".format(name, cv_result.mean() * 100))
print("\n" "Training Performance:" "\n")
for name, model in models:
    model.fit(X_train_res, y_train_res)
    # NOTE: trained on the resampled data but scored on the ORIGINAL training
    # split, so these numbers are comparable with the baseline run.
    scores = recall_score(y_train, model.predict(X_train)) * 100
    print("{}: {}".format(name, scores))
Cross-Validation Performance: Bagging: 95.62691219765628 Random forest: 97.56830033289076 GBM: 97.68592814947372 Adaboost: 96.33256364371067 Xgboost: 98.25463256941639 dtree: 94.76393619273027 Training Performance: Bagging: 99.52931947440675 Random forest: 100.0 GBM: 98.35261816042362 Adaboost: 96.78368307511278 Xgboost: 100.0 dtree: 100.0
from imblearn.under_sampling import ClusterCentroids
# Undersample the majority class by replacing clusters of majority samples
# with their K-means centroids.
# FIX: pin random_state for reproducibility — every other estimator in this
# notebook uses random_state=1, but the original left ClusterCentroids
# unseeded, so the undersampled set (and all downstream scores) varied per run.
cc = ClusterCentroids(random_state=1)
X_cc, y_cc = cc.fit_resample(X_train, y_train)
X_cc.shape
(1952, 30)
y_cc
0 0
1 0
2 0
3 0
4 0
..
1947 1
1948 1
1949 1
1950 1
1951 1
Name: Attrition_Flag_Existing Customer, Length: 1952, dtype: uint8
# Same candidate classifiers as before, now evaluated on the
# cluster-centroid undersampled data.
models = [
    ("Bagging", BaggingClassifier(random_state=1)),
    ("Random forest", RandomForestClassifier(random_state=1)),
    ("GBM", GradientBoostingClassifier(random_state=1)),
    ("Adaboost", AdaBoostClassifier(random_state=1)),
    ("Xgboost", XGBClassifier(random_state=1, eval_metric="logloss")),
    ("dtree", DecisionTreeClassifier(random_state=1)),
]
results = []  # per-model arrays of CV fold scores
names = []  # model names, parallel to `results`
print("\n" "Cross-Validation Performance:" "\n")
for name, model in models:
    # 5-fold stratified CV on the undersampled data, scored by recall.
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    cv_result = cross_val_score(
        estimator=model, X=X_cc, y=y_cc, scoring="recall", cv=kfold
    )
    results.append(cv_result)
    names.append(name)
    print("{}: {}".format(name, cv_result.mean() * 100))
print("\n" "Training Performance:" "\n")
for name, model in models:
    # Fit on the undersampled data but report recall on the ORIGINAL training set.
    model.fit(X_cc, y_cc)
    scores = recall_score(y_train, model.predict(X_train)) * 100
    print("{}: {}".format(name, scores))
Cross-Validation Performance: Bagging: 95.69649398220828 Random forest: 95.90266875981162 GBM: 97.02825745682888 Adaboost: 96.10779696493982 Xgboost: 97.33647305075877 dtree: 94.46781789638933 Training Performance: Bagging: 66.15022553441851 Random forest: 53.22612276917042 GBM: 55.42263188860561 Adaboost: 58.560502059227304 Xgboost: 58.71739556775838 dtree: 62.69856834673465
# Helper used throughout the notebook to summarise classifier quality.
def model_performance_classification_sklearn(model, predictors, target):
    """
    Compute common classification metrics for a fitted sklearn-style model.

    model: fitted classifier exposing .predict
    predictors: independent variables
    target: true labels

    Returns a single-row DataFrame with Accuracy, Recall, Precision and F1.
    """
    predictions = model.predict(predictors)
    # Assemble all metrics in one pass; column order matches the dict order.
    metrics_row = {
        "Accuracy": accuracy_score(target, predictions),
        "Recall": recall_score(target, predictions),
        "Precision": precision_score(target, predictions),
        "F1": f1_score(target, predictions),
    }
    return pd.DataFrame(metrics_row, index=[0])
def confusion_matrix_sklearn(model, predictors, target):
    """
    Plot the confusion matrix of a fitted classifier, annotating each cell
    with the raw count and its percentage of all predictions.

    model: fitted classifier exposing .predict
    predictors: independent variables
    target: true labels
    """
    y_pred = model.predict(predictors)
    cm = confusion_matrix(target, y_pred)
    # FIX: hoist the total out of the loop (was recomputed per cell) and use
    # cm.shape instead of a hard-coded (2, 2) so the plot also works for
    # multiclass targets.
    total = cm.sum()
    labels = np.asarray(
        [
            "{0:0.0f}".format(count) + "\n{0:.2%}".format(count / total)
            for count in cm.flatten()
        ]
    ).reshape(cm.shape)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
%%time
# defining model
model = AdaBoostClassifier(random_state=1)
# Parameter grid to pass in GridSearchCV
param_grid = {
"n_estimators": np.arange(10, 110, 10),
"learning_rate": [0.1, 0.01, 0.2, 0.05, 1],
"base_estimator": [
DecisionTreeClassifier(max_depth=1, random_state=1),
DecisionTreeClassifier(max_depth=2, random_state=1),
DecisionTreeClassifier(max_depth=3, random_state=1),
],
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling GridSearchCV
grid_cv = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scorer, cv=5, n_jobs = -1)
# Fitting parameters in GridSeachCV
grid_cv.fit(X_train, y_train)
print(
"Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_)
)
Best Parameters:{'base_estimator': DecisionTreeClassifier(max_depth=1, random_state=1), 'learning_rate': 0.1, 'n_estimators': 10}
Score: 1.0
Wall time: 1min 58s
# Rebuild AdaBoost with the best parameters found by GridSearchCV.
adb_tuned1 = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=1, random_state=1),
    n_estimators=10,
    learning_rate=0.1,
    random_state=1,
)
# Train the tuned model on the full training set.
adb_tuned1.fit(X_train, y_train)
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1,
random_state=1),
learning_rate=0.1, n_estimators=10, random_state=1)
# Calculating different metrics on train set
Adaboost_grid_train = model_performance_classification_sklearn(
    adb_tuned1, X_train, y_train
)
print("Training performance:")
Adaboost_grid_train  # display the single-row metrics DataFrame
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.83934 | 1.00000 | 0.83934 | 0.91265 |
# Calculating different metrics on validation set
Adaboost_grid_val = model_performance_classification_sklearn(adb_tuned1, X_val, y_val)
print("Validation performance:")
Adaboost_grid_val  # display the single-row metrics DataFrame
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.83909 | 1.00000 | 0.83909 | 0.91251 |
# Confusion matrix (counts and percentages) for the grid-search-tuned
# AdaBoost model on the validation set
confusion_matrix_sklearn(adb_tuned1, X_val, y_val)
The tuned AdaBoost model's performance is good, with consistent metrics across the training and validation sets.
%%time
# defining model
model = AdaBoostClassifier(random_state=1)
# Parameter grid to pass in GridSearchCV
param_grid = {
"n_estimators": np.arange(10, 110, 10),
"learning_rate": [0.1, 0.01, 0.2, 0.05, 1],
"base_estimator": [
DecisionTreeClassifier(max_depth=1, random_state=1),
DecisionTreeClassifier(max_depth=2, random_state=1),
DecisionTreeClassifier(max_depth=3, random_state=1),
],
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_jobs = -1, n_iter=50, scoring=scorer, cv=5, random_state=1)
#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train,y_train)
print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))
Best parameters are {'n_estimators': 50, 'learning_rate': 0.01, 'base_estimator': DecisionTreeClassifier(max_depth=1, random_state=1)} with CV score=1.0:
Wall time: 43 s
# Rebuild AdaBoost with the best parameters found by RandomizedSearchCV.
adb_tuned2 = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=1, random_state=1),
    n_estimators=50,
    learning_rate=0.01,
    random_state=1,
)
# Train the tuned model on the full training set.
adb_tuned2.fit(X_train, y_train)
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1,
random_state=1),
learning_rate=0.01, random_state=1)
# Calculating different metrics on train set
Adaboost_random_train = model_performance_classification_sklearn(
    adb_tuned2, X_train, y_train
)
print("Training performance:")
Adaboost_random_train  # display the single-row metrics DataFrame
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.83934 | 1.00000 | 0.83934 | 0.91265 |
# Calculating different metrics on validation set
Adaboost_random_val = model_performance_classification_sklearn(adb_tuned2, X_val, y_val)
print("Validation performance:")
Adaboost_random_val  # display the single-row metrics DataFrame
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.83909 | 1.00000 | 0.83909 | 0.91251 |
# Confusion matrix (counts and percentages) for the randomized-search-tuned
# AdaBoost model on the validation set
confusion_matrix_sklearn(adb_tuned2, X_val, y_val)
Performance is similar to that of the model tuned with GridSearchCV.
%%time
#defining model
model = XGBClassifier(random_state=1,eval_metric='logloss')
#Parameter grid to pass in GridSearchCV
param_grid={'n_estimators':np.arange(50,150,50),
'scale_pos_weight':[2,5,10],
'learning_rate':[0.01,0.1,0.2,0.05],
'gamma':[0,1,3,5],
'subsample':[0.8,0.9,1],
'max_depth':np.arange(1,5,1),
'reg_lambda':[5,10]}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling GridSearchCV
grid_cv = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scorer, cv=5, n_jobs = -1, verbose= 2)
#Fitting parameters in GridSeachCV
grid_cv.fit(X_train,y_train)
print("Best parameters are {} with CV score={}:" .format(grid_cv.best_params_,grid_cv.best_score_))
Fitting 5 folds for each of 2304 candidates, totalling 11520 fits
Best parameters are {'gamma': 0, 'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 50, 'reg_lambda': 5, 'scale_pos_weight': 2, 'subsample': 0.8} with CV score=1.0:
Wall time: 20min 8s
# Rebuild XGBoost with the best parameters found by GridSearchCV.
xgb_tuned1 = XGBClassifier(
    random_state=1,
    n_estimators=50,
    max_depth=1,
    learning_rate=0.01,
    gamma=0,
    subsample=0.8,
    scale_pos_weight=2,
    reg_lambda=5,
    eval_metric="logloss",
)
# Train the tuned model on the full training set.
xgb_tuned1.fit(X_train, y_train)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
gamma=0, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.01, max_delta_step=0,
max_depth=1, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=50, n_jobs=8,
num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=5,
scale_pos_weight=2, subsample=0.8, tree_method='exact',
validate_parameters=1, verbosity=None)
# Calculating different metrics on train set
xgboost_grid_train = model_performance_classification_sklearn(
    xgb_tuned1, X_train, y_train
)
print("Training performance:")
xgboost_grid_train  # display the single-row metrics DataFrame
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.83934 | 1.00000 | 0.83934 | 0.91265 |
# Calculating different metrics on validation set
xgboost_grid_val = model_performance_classification_sklearn(xgb_tuned1, X_val, y_val)
print("Validation performance:")
xgboost_grid_val  # display the single-row metrics DataFrame
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.83909 | 1.00000 | 0.83909 | 0.91251 |
# Confusion matrix (counts and percentages) for the grid-search-tuned
# XGBoost model on the validation set
confusion_matrix_sklearn(xgb_tuned1, X_val, y_val)
%%time
# defining model
model = XGBClassifier(random_state=1,eval_metric='logloss')
# Parameter grid to pass in RandomizedSearchCV
param_grid={'n_estimators':np.arange(50,150,50),
'scale_pos_weight':[2,5,10],
'learning_rate':[0.01,0.1,0.2,0.05],
'gamma':[0,1,3,5],
'subsample':[0.8,0.9,1],
'max_depth':np.arange(1,5,1),
'reg_lambda':[5,10]}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
xgb_tuned2 = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=50, scoring=scorer, cv=5, random_state=1, n_jobs = -1)
#Fitting parameters in RandomizedSearchCV
xgb_tuned2.fit(X_train,y_train)
print("Best parameters are {} with CV score={}:" .format(xgb_tuned2.best_params_,xgb_tuned2.best_score_))
Best parameters are {'subsample': 0.8, 'scale_pos_weight': 2, 'reg_lambda': 10, 'n_estimators': 100, 'max_depth': 1, 'learning_rate': 0.01, 'gamma': 5} with CV score=1.0:
Wall time: 25.9 s
# Rebuild XGBoost with the best parameters found by the randomized search.
# FIX: the search reported learning_rate=0.01 as the best value, but the
# original cell rebuilt the model with learning_rate=0.1 — use the reported
# best parameter. (Downstream tables generated from the 0.1 model will
# change slightly on re-run.)
xgb_tuned2 = XGBClassifier(
    random_state=1,
    n_estimators=100,
    scale_pos_weight=2,
    gamma=5,
    subsample=0.8,
    learning_rate=0.01,
    eval_metric="logloss",
    max_depth=1,
    reg_lambda=10,
)
# Fit the model on training data
xgb_tuned2.fit(X_train, y_train)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
gamma=5, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.1, max_delta_step=0,
max_depth=1, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=8,
num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=10,
scale_pos_weight=2, subsample=0.8, tree_method='exact',
validate_parameters=1, verbosity=None)
# Calculating different metrics on train set
xgboost_random_train = model_performance_classification_sklearn(
    xgb_tuned2, X_train, y_train
)
print("Training performance:")
xgboost_random_train  # display the single-row metrics DataFrame
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.89333 | 0.99686 | 0.88941 | 0.94008 |
# Calculating different metrics on validation set
xgboost_random_val = model_performance_classification_sklearn(xgb_tuned2, X_val, y_val)
print("Validation performance:")
xgboost_random_val  # display the single-row metrics DataFrame
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.88648 | 0.99882 | 0.88162 | 0.93657 |
# Confusion matrix (counts and percentages) for the randomized-search-tuned
# XGBoost model on the validation set
confusion_matrix_sklearn(xgb_tuned2, X_val, y_val)
# Place the training metrics of every tuned model side by side,
# one column per model.
tuned_train_results = {
    "AdaBoost Tuned with Grid search": Adaboost_grid_train,
    "AdaBoost Tuned with Random search": Adaboost_random_train,
    "Xgboost Tuned with Grid search": xgboost_grid_train,
    "Xgboost Tuned with Random Search": xgboost_random_train,
}
models_train_comp_df = pd.concat(
    [perf.T for perf in tuned_train_results.values()], axis=1
)
models_train_comp_df.columns = list(tuned_train_results.keys())
print("Training performance comparison:")
models_train_comp_df
Training performance comparison:
| AdaBoost Tuned with Grid search | AdaBoost Tuned with Random search | Xgboost Tuned with Grid search | Xgboost Tuned with Random Search | |
|---|---|---|---|---|
| Accuracy | 0.83934 | 0.83934 | 0.83934 | 0.89333 |
| Recall | 1.00000 | 1.00000 | 1.00000 | 0.99686 |
| Precision | 0.83934 | 0.83934 | 0.83934 | 0.88941 |
| F1 | 0.91265 | 0.91265 | 0.91265 | 0.94008 |
# Place the validation metrics of every tuned model side by side,
# one column per model.
tuned_val_results = {
    "AdaBoost Tuned with Grid search": Adaboost_grid_val,
    "AdaBoost Tuned with Random search": Adaboost_random_val,
    "Xgboost Tuned with Grid search": xgboost_grid_val,
    "Xgboost Tuned with Random Search": xgboost_random_val,
}
models_val_comp_df = pd.concat(
    [perf.T for perf in tuned_val_results.values()], axis=1
)
models_val_comp_df.columns = list(tuned_val_results.keys())
print("Validation performance comparison:")
models_val_comp_df
Validation performance comparison:
| AdaBoost Tuned with Grid search | AdaBoost Tuned with Random search | Xgboost Tuned with Grid search | Xgboost Tuned with Random Search | |
|---|---|---|---|---|
| Accuracy | 0.83909 | 0.83909 | 0.83909 | 0.88648 |
| Recall | 1.00000 | 1.00000 | 1.00000 | 0.99882 |
| Precision | 0.83909 | 0.83909 | 0.83909 | 0.88162 |
| F1 | 0.91251 | 0.91251 | 0.91251 | 0.93657 |
XGBoost tuned with randomized search has the best performance among the compared models.
# Calculating different metrics on the test set
xgboost_rand_test = model_performance_classification_sklearn(xgb_tuned2, X_test, y_test)
print("Test performance:")
xgboost_rand_test  # display the single-row metrics DataFrame
Test performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.88598 | 0.99706 | 0.88241 | 0.93624 |
The model is not overfitted and performs well on the test data.
# Horizontal bar chart of the final model's feature importances,
# sorted so the most important feature appears at the top.
feature_names = X.columns
importances = xgb_tuned2.feature_importances_
indices = np.argsort(importances)  # ascending importance order
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="blue", align="center")
plt.yticks(range(len(indices)), feature_names[indices])
plt.xlabel("Relative Importance")
plt.show()
Total_Ct_Chng_Q4_Q1, Total_Revolving_Bal, Total_Trans_Ct, Total_Relationship_Count, Months_Inactive_12_mon, Contacts_Count_12_mon, and Total_Trans_Amt are the most important features according to the selected model.
# Separating target variable and other variables
X = ccpredf3.drop(columns="Attrition_Flag_Existing Customer")
Y = ccpredf3["Attrition_Flag_Existing Customer"]
# Splitting the data into train and test sets
# (70/30, stratified so both sets keep the original class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=1, stratify=Y
)
print(X_train.shape, X_test.shape)
(7088, 30) (3039, 30)
from sklearn.pipeline import Pipeline
# Creating new pipeline with best parameters
# NOTE(review): the step is labelled "ADA" but wraps an XGBClassifier —
# consider renaming the step to "XGB"; left unchanged here in case later
# code references model.named_steps["ADA"].
model = Pipeline(steps=[("ADA",XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
gamma=5, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.1, max_delta_step=0,
max_depth=1, min_child_weight=1,
monotone_constraints='()', n_estimators=100, n_jobs=8,
num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=10,
scale_pos_weight=2, subsample=0.8, tree_method='exact',
validate_parameters=1, verbosity=None),
),])
# Fit the model on training data
model.fit(X_train, y_train)
Pipeline(steps=[('ADA',
XGBClassifier(base_score=0.5, booster='gbtree',
colsample_bylevel=1, colsample_bynode=1,
colsample_bytree=1, eval_metric='logloss',
gamma=5, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.1,
max_delta_step=0, max_depth=1,
min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100,
n_jobs=8, num_parallel_tree=1, random_state=1,
reg_alpha=0, reg_lambda=10, scale_pos_weight=2,
subsample=0.8, tree_method='exact',
validate_parameters=1, verbosity=None))])